In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Matplotlib and seaborn configuration
plt.rcParams['figure.figsize'] = (16,10)

# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old aliases, so use whichever name this
# installation actually provides (falling back to the default style).
estilo_grafico = next(
    (nome for nome in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
     if nome in plt.style.available),
    'default',
)
plt.style.use(estilo_grafico)

import warnings
# NOTE(review): this silences every warning for the whole notebook, including
# pandas deprecation / dtype warnings that may point at real problems.
warnings.filterwarnings('ignore')
In [3]:
# Load the three raw tables (books, ratings, users) from CSV files.
# The paths are relative, so they resolve against the kernel's working directory.
books = pd.read_csv("Dados_Recomendacao - Livro/Books.csv")
ratings = pd.read_csv("Dados_Recomendacao - Livro/Ratings.csv")
users = pd.read_csv("Dados_Recomendacao - Livro/Users.csv")

Modelagem dos dados

In [5]:
# Dimensions (rows, columns) of each raw table
books.shape, ratings.shape, users.shape
Out[5]:
((271360, 8), (1149780, 3), (278858, 3))
In [6]:
# Preview the first three rows of the books table
books.head(3)
Out[6]:
ISBN Book-Title Book-Author Year-Of-Publication Publisher Image-URL-S Image-URL-M Image-URL-L
0 0195153448 Classical Mythology Mark P. O. Morford 2002 Oxford University Press http://images.amazon.com/images/P/0195153448.0... http://images.amazon.com/images/P/0195153448.0... http://images.amazon.com/images/P/0195153448.0...
1 0002005018 Clara Callan Richard Bruce Wright 2001 HarperFlamingo Canada http://images.amazon.com/images/P/0002005018.0... http://images.amazon.com/images/P/0002005018.0... http://images.amazon.com/images/P/0002005018.0...
2 0060973129 Decision in Normandy Carlo D'Este 1991 HarperPerennial http://images.amazon.com/images/P/0060973129.0... http://images.amazon.com/images/P/0060973129.0... http://images.amazon.com/images/P/0060973129.0...
In [7]:
# Column dtypes and non-null counts of the books table
books.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271360 non-null  object
 1   Book-Title           271360 non-null  object
 2   Book-Author          271359 non-null  object
 3   Year-Of-Publication  271360 non-null  object
 4   Publisher            271358 non-null  object
 5   Image-URL-S          271360 non-null  object
 6   Image-URL-M          271360 non-null  object
 7   Image-URL-L          271357 non-null  object
dtypes: object(8)
memory usage: 16.6+ MB
In [8]:
# Preview the first three rows of the ratings table
ratings.head(3)
Out[8]:
User-ID ISBN Book-Rating
0 276725 034545104X 0
1 276726 0155061224 5
2 276727 0446520802 0
In [9]:
# Column dtypes and non-null counts of the ratings table
ratings.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB
In [10]:
# Preview the first three rows of the users table
users.head(3)
Out[10]:
User-ID Location Age
0 1 nyc, new york, usa NaN
1 2 stockton, california, usa 18.0
2 3 moscow, yukon territory, russia NaN
In [11]:
# Column dtypes and non-null counts of the users table
users.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   User-ID   278858 non-null  int64  
 1   Location  278858 non-null  object 
 2   Age       168096 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.4+ MB
In [12]:
# Join the tables: ratings with their users (on User-ID), then with the
# rated books (on ISBN). Both joins are inner joins.
dados_cruzados = (
    ratings
    .merge(users, how='inner', on='User-ID')
    .merge(books, how='inner', on='ISBN')
)
In [13]:
# Preview the first three rows of the joined table
dados_cruzados.head(3)
Out[13]:
User-ID ISBN Book-Rating Location Age Book-Title Book-Author Year-Of-Publication Publisher Image-URL-S Image-URL-M Image-URL-L
0 276725 034545104X 0 tyler, texas, usa NaN Flesh Tones: A Novel M. J. Rose 2002 Ballantine Books http://images.amazon.com/images/P/034545104X.0... http://images.amazon.com/images/P/034545104X.0... http://images.amazon.com/images/P/034545104X.0...
1 2313 034545104X 5 cincinnati, ohio, usa 23.0 Flesh Tones: A Novel M. J. Rose 2002 Ballantine Books http://images.amazon.com/images/P/034545104X.0... http://images.amazon.com/images/P/034545104X.0... http://images.amazon.com/images/P/034545104X.0...
2 6543 034545104X 0 strafford, missouri, usa 34.0 Flesh Tones: A Novel M. J. Rose 2002 Ballantine Books http://images.amazon.com/images/P/034545104X.0... http://images.amazon.com/images/P/034545104X.0... http://images.amazon.com/images/P/034545104X.0...
In [14]:
# Column dtypes and non-null counts of the joined table
dados_cruzados.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1031136 entries, 0 to 1031135
Data columns (total 12 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   User-ID              1031136 non-null  int64  
 1   ISBN                 1031136 non-null  object 
 2   Book-Rating          1031136 non-null  int64  
 3   Location             1031136 non-null  object 
 4   Age                  753301 non-null   float64
 5   Book-Title           1031136 non-null  object 
 6   Book-Author          1031135 non-null  object 
 7   Year-Of-Publication  1031136 non-null  object 
 8   Publisher            1031134 non-null  object 
 9   Image-URL-S          1031136 non-null  object 
 10  Image-URL-M          1031136 non-null  object 
 11  Image-URL-L          1031132 non-null  object 
dtypes: float64(1), int64(2), object(9)
memory usage: 102.3+ MB
In [15]:
# Rows whose year field holds the text 'DK Publishing Inc' instead of a year
ano_invalido = dados_cruzados['Year-Of-Publication'] == 'DK Publishing Inc'
dados_cruzados.loc[ano_invalido, 'Year-Of-Publication']
Out[15]:
911154    DK Publishing Inc
949657    DK Publishing Inc
949658    DK Publishing Inc
Name: Year-Of-Publication, dtype: object
In [16]:
# Blank out every year value that cannot be parsed as a number, so the
# numeric conversion of the column succeeds.
# A content-based mask is used instead of hard-coded row positions: positions
# depend on the merge order, and one of the original positions (918145) was
# not among the rows found by the lookup for 'DK Publishing Inc'.
ano_nao_numerico = pd.to_numeric(
    dados_cruzados['Year-Of-Publication'], errors='coerce'
).isna()
dados_cruzados.loc[ano_nao_numerico, 'Year-Of-Publication'] = ""
In [17]:
# Convert the year column to a numeric dtype.
# pd.to_numeric is called with its default error handling, so any value that
# cannot be parsed raises here; the column shows up as float64 with missing
# entries in the later summaries.
dados_cruzados['Year-Of-Publication'] = pd.to_numeric(dados_cruzados['Year-Of-Publication'])
In [18]:
# Reduce "Location" to its last ", "-separated piece (the country), upper-cased
def _extrair_pais(local):
    partes = local.split(", ")
    return partes[-1].upper()

dados_cruzados['Location'] = dados_cruzados['Location'].map(_extrair_pais)

Exploração dos dados

In [19]:
# Summary statistics of the numeric columns (before removing zero ratings)
dados_cruzados.describe()
Out[19]:
User-ID Book-Rating Age Year-Of-Publication
count 1.031136e+06 1.031136e+06 753301.000000 1.031132e+06
mean 1.405945e+05 2.839051e+00 37.397648 1.968195e+03
std 8.052466e+04 3.854157e+00 14.098254 2.311015e+02
min 2.000000e+00 0.000000e+00 0.000000 0.000000e+00
25% 7.041500e+04 0.000000e+00 28.000000 1.992000e+03
50% 1.412100e+05 0.000000e+00 35.000000 1.997000e+03
75% 2.114260e+05 7.000000e+00 45.000000 2.001000e+03
max 2.788540e+05 1.000000e+01 244.000000 2.050000e+03
In [20]:
# Drop rows whose rating is zero, then re-check the column summary
tem_nota = dados_cruzados['Book-Rating'].gt(0)
dados_cruzados = dados_cruzados.loc[tem_nota]
dados_cruzados.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 383842 entries, 1 to 1031135
Data columns (total 12 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   User-ID              383842 non-null  int64  
 1   ISBN                 383842 non-null  object 
 2   Book-Rating          383842 non-null  int64  
 3   Location             383842 non-null  object 
 4   Age                  269621 non-null  float64
 5   Book-Title           383842 non-null  object 
 6   Book-Author          383841 non-null  object 
 7   Year-Of-Publication  383841 non-null  float64
 8   Publisher            383840 non-null  object 
 9   Image-URL-S          383842 non-null  object 
 10  Image-URL-M          383842 non-null  object 
 11  Image-URL-L          383841 non-null  object 
dtypes: float64(2), int64(2), object(8)
memory usage: 38.1+ MB
In [21]:
# Summary statistics of the numeric columns after removing zero ratings
dados_cruzados.describe()
Out[21]:
User-ID Book-Rating Age Year-Of-Publication
count 383842.000000 383842.000000 269621.000000 383841.000000
mean 136031.461260 7.626701 36.835829 1965.636678
std 80482.299401 1.841339 13.753045 243.221296
min 8.000000 1.000000 0.000000 0.000000
25% 67591.000000 7.000000 28.000000 1992.000000
50% 133789.000000 8.000000 35.000000 1997.000000
75% 206219.000000 9.000000 45.000000 2001.000000
max 278854.000000 10.000000 244.000000 2050.000000
In [33]:
# Box plot of the rating distribution on an explicitly created figure/axes
fig, ax = plt.subplots(figsize=(7, 4))
ax.set_title("Análise avaliações")
sns.boxplot(x='Book-Rating', data=dados_cruzados, ax=ax)
Out[33]:
<AxesSubplot:title={'center':'Análise avaliações'}, xlabel='Book-Rating'>
In [34]:
# Per-title rating statistics: number of ratings, mean, max, min and median
agregacoes = {
    'Quantidade': ('Book-Title', 'count'),
    'Média': ('Book-Rating', 'mean'),
    'Max': ('Book-Rating', 'max'),
    'Min': ('Book-Rating', 'min'),
    'Mediana': ('Book-Rating', 'median'),
}
analise = dados_cruzados.groupby('Book-Title').agg(**agregacoes)

analise.head()
Out[34]:
Quantidade Média Max Min Mediana
Book-Title
A Light in the Storm: The Civil War Diary of Amelia Martin, Fenwick Island, Delaware, 1861 (Dear America) 1 9.000000 9 9 9.0
Ask Lily (Young Women of Faith: Lily Series, Book 5) 1 8.000000 8 8 8.0
Dark Justice 1 10.000000 10 10 10.0
Earth Prayers From around the World: 365 Prayers, Poems, and Invocations for Honoring the Earth 7 7.142857 10 1 7.0
Final Fantasy Anthology: Official Strategy Guide (Brady Games) 2 10.000000 10 10 10.0
In [35]:
analise.sort_values("Quantidade", ascending = False).head(10)
Out[35]:
Quantidade Média Max Min Mediana
Book-Title
The Lovely Bones: A Novel 707 8.185290 10 1 8.0
Wild Animus 581 4.390706 10 1 4.0
The Da Vinci Code 494 8.439271 10 1 9.0
The Secret Life of Bees 406 8.477833 10 2 9.0
The Nanny Diaries: A Novel 393 7.437659 10 1 8.0
The Red Tent (Bestselling Backlist) 383 8.182768 10 2 9.0
Bridget Jones's Diary 377 7.625995 10 1 8.0
A Painted House 366 7.398907 10 1 8.0
Life of Pi 336 8.080357 10 1 8.0
Harry Potter and the Chamber of Secrets (Book 2) 326 8.840491 10 4 9.0
In [36]:
# Number of ratings vs. mean rating, one point per title

px.scatter(data_frame = analise, x = 'Quantidade', y= 'Média', title = 'Média x Quantidade')
In [38]:
analise['Quantidade'].describe()
# Most titles have very few ratings
Out[38]:
count    135567.000000
mean          2.831382
std           9.135691
min           1.000000
25%           1.000000
50%           1.000000
75%           2.000000
max         707.000000
Name: Quantidade, dtype: float64
In [39]:
def Classificao_Quantidade( Quantidade ):
    '''
    Return the label of the rating-count bucket that `Quantidade` falls in.

    The value is converted with int() before being compared against the
    inclusive upper bounds 5, 10, 50 and 100; anything above 100 gets the
    last label.
    '''
    total = int( Quantidade )

    # (inclusive upper bound, label), checked in ascending order
    faixas = (
        (5, '1-5 Avaliações'),
        (10, '6-10 Avaliações'),
        (50, '11-50 Avaliações'),
        (100, '51-100 Avaliações'),
    )

    for limite, rotulo in faixas:
        if total <= limite:
            return rotulo

    return '>101 Avaliações'

# Share of titles that falls into each rating-count bucket
Pizza = analise['Quantidade'].apply( Classificao_Quantidade ).value_counts( normalize=True )

# Turn the result into a DataFrame with explicitly named columns.
# The names produced by value_counts().reset_index() differ between pandas
# versions (pandas 2.0 changed them), so naming them here keeps the
# 'index' / 'Quantidade' columns used by the chart below stable.
Pizza = Pizza.rename_axis('index').reset_index(name='Quantidade')

# Pie chart: one slice per bucket, sized by its share
px.pie(
    data_frame=Pizza,
    names='index', values='Quantidade',
    title='Divisão das Quantidades'
)
In [40]:
# Number of titles with more than 50 ratings
analise.loc[analise['Quantidade'] > 50, 'Quantidade'].count()
Out[40]:
626
In [41]:
# Keep only the ratings of titles that have at least 50 ratings
analise = analise.reset_index()
contagem_por_titulo = analise[['Book-Title','Quantidade']]
dados_cruzados = dados_cruzados.merge(contagem_por_titulo, on = 'Book-Title')
livro_popular = dados_cruzados['Quantidade'] >= 50
dados_cruzados = dados_cruzados[livro_popular]
dados_cruzados.shape
Out[41]:
(65477, 13)
In [42]:
# Number of ratings per publication year, ordered by year.
# The columns are named explicitly ('index' = year, 'Year-Of-Publication' =
# count) because the names produced by value_counts().reset_index() differ
# between pandas versions (pandas 2.0 changed them); these are the names the
# original output shows.
analise_ano = (
    dados_cruzados['Year-Of-Publication']
    .value_counts()
    .sort_index()
    .rename_axis('index')
    .reset_index(name='Year-Of-Publication')
)
analise_ano.head()
Out[42]:
index Year-Of-Publication
0 0.0 336
1 1920.0 1
2 1938.0 6
3 1943.0 3
4 1948.0 2
In [43]:
# Restrict to publication years strictly between 1990 and 2020
anos = analise_ano['index']
filtro = analise_ano.loc[anos.gt(1990) & anos.lt(2020)]

# Bar chart: number of ratings per publication year
plt.figure(figsize=(20,10))
plt.title('Analisando ano de publicação')
plt.bar(filtro['index'], filtro['Year-Of-Publication'])
Out[43]:
<BarContainer object of 16 artists>
In [44]:
# Per-author number of ratings and mean rating, most-rated authors first
resumo_autores = dados_cruzados.groupby('Book-Author').agg(
    Quantidade=('Book-Rating', 'count'),
    Media=('Book-Rating', 'mean'),
)
resumo_autores.sort_values('Quantidade', ascending = False)
Out[44]:
Quantidade Media
Book-Author
Stephen King 3326 7.787432
John Grisham 2379 7.526692
James Patterson 2010 7.710945
J. K. Rowling 1552 8.994845
Janet Evanovich 1225 7.968980
... ... ...
Irving John 1 6.000000
Patricia Springer 1 8.000000
Patricia Potter 1 9.000000
Patricia Highsmith 1 6.000000
John Harvey 1 8.000000

482 rows × 2 columns

In [45]:
# Share of ratings per location value, in percent (top 20)
participacao = dados_cruzados['Location'].value_counts(normalize = True)
participacao.head(20).mul(100)
Out[45]:
USA               76.797960
CANADA             9.406356
,                  2.304626
UNITED KINGDOM     2.177864
AUSTRALIA          1.376056
N/A                0.934679
GERMANY            0.884280
PORTUGAL           0.722391
MALAYSIA           0.661301
SPAIN              0.494830
NETHERLANDS        0.365014
NEW ZEALAND        0.282542
FRANCE             0.244361
ITALY              0.207707
SINGAPORE          0.171052
PHILIPPINES        0.157307
CALIFORNIA,        0.123708
SWITZERLAND        0.119126
AUSTRIA            0.116071
JAPAN              0.111490
Name: Location, dtype: float64
In [46]:
# Clean up noisy country values: ',' and 'N/A' become empty strings and
# 'CALIFORNIA,' becomes 'USA' (whole-value matches only).
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on dados_cruzados['Location']: the in-place form
# mutates a temporary Series, which is deprecated in recent pandas and has
# no effect on the DataFrame once Copy-on-Write is enabled.
ruido_paises = {',': '', 'N/A': '', 'CALIFORNIA,': 'USA'}
dados_cruzados['Location'] = dados_cruzados['Location'].replace(ruido_paises)
In [51]:
# Box plot of the age distribution on an explicitly created figure/axes
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title('Distribuição idades')
sns.boxplot(data=dados_cruzados, x='Age', ax=ax)
Out[51]:
<AxesSubplot:title={'center':'Distribuição idades'}, xlabel='Age'>
In [52]:
# Keep only rows with 10 <= Age < 100.
# Rows with a missing Age fail both comparisons and are dropped as well.
idade = dados_cruzados['Age']
idade_valida = idade.ge(10) & idade.lt(100)
dados_cruzados = dados_cruzados.loc[idade_valida]

Criação do modelo

In [53]:
# Build the title x user matrix of ratings; title/user pairs without a
# rating are filled with 0
matriz = dados_cruzados.pivot_table(
    index = 'Book-Title',
    columns = 'User-ID',
    values = 'Book-Rating',
).fillna(0)

matriz.head()
Out[53]:
User-ID 42 51 99 114 125 165 183 185 242 254 ... 278755 278798 278800 278807 278824 278832 278836 278843 278844 278846
Book-Title
1984 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 9.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1st to Die: A Novel 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2nd Chance 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 Blondes 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
84 Charing Cross Road 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 14965 columns

In [54]:
# Convert the dense rating matrix into a SciPy compressed sparse column matrix
from scipy.sparse import csc_matrix
matriz_sparse = csc_matrix(matriz)
In [55]:
# Build the nearest-neighbours model
from sklearn.neighbors import NearestNeighbors

# Brute-force search that returns 5 neighbours per query
modelo = NearestNeighbors(n_neighbors = 5, algorithm = 'brute')

# Fit on the sparse rating matrix (one row per title)
modelo.fit(matriz_sparse)
Out[55]:
NearestNeighbors(algorithm='brute')
In [56]:
# Query vector: the rating row of the title at position 213, kept
# two-dimensional (1 x n_users) by selecting it with a list
selecionar_livro = matriz.iloc[[213]].values

# Distances and row positions of the nearest titles
distancia, recomendacao = modelo.kneighbors(selecionar_livro)

# Show the titles found for each query row (there is a single one here)
for vizinhos in recomendacao:
    livro = matriz.index[vizinhos]
    print(livro)
Index(['Harry Potter and the Chamber of Secrets (Book 2)',
       'Harry Potter and the Prisoner of Azkaban (Book 3)',
       'Harry Potter and the Goblet of Fire (Book 4)',
       'Harry Potter and the Sorcerer's Stone (Book 1)',
       'The Shelters of Stone (Earth's Children Series, No 5)'],
      dtype='object', name='Book-Title')
In [58]:
# Collect the cover URL and download the cover image of each recommended title
import io
import PIL
import PIL.Image        # `import PIL` alone does not load the Image submodule
import urllib
import urllib.request   # likewise, `import urllib` does not load urllib.request
import requests
import matplotlib.image as mpimg

urls = []
imagens = []
for posicao in recomendacao[0]:
    livro = matriz.index[posicao]
    # Use the large-cover URL of the first book row with this title
    link = books.loc[books['Book-Title'] == livro, 'Image-URL-L'].iloc[0]
    urls.append(link)
    # Read the whole response and close the connection before decoding, so no
    # HTTP response is left open; the image is decoded from the in-memory copy.
    with urllib.request.urlopen(link) as resposta:
        conteudo = resposta.read()
    imagens.append(PIL.Image.open(io.BytesIO(conteudo)))
In [59]:
# Build the report: one row of five panels titled as the selection and its
# four recommendations

titulos = ['Seleção','Recomendação 1', 'Recomendação 2', 'Recomendação 3', 'Recomendação 4']
import plotly.graph_objects as Go
from plotly.subplots import make_subplots
figure = make_subplots(rows = 1, cols = 5, subplot_titles = titulos)

figure.update_layout(
    title_text = 'Sistema de Recomendação',
    width = 1200,
    height = 500,
    showlegend = False
)

# Place one downloaded cover per subplot column (plotly columns are 1-based)
for col, imagem in enumerate(imagens, start = 1):
    figure.add_trace(Go.Image(z = imagem), row = 1, col = col)

# Render the figure
figure.show()